1. Data

Information about the test:

## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
question description MATH_group
A1 properties of fractions A
A2 find intersection of lines A
A3 composition of functions B
A4 completing the square A
A5 trig double angle formula A
A6 trig wave function A
A7 graphical vector sum B
A8 compute angle between 3d vectors A
A9 simplify logs A
A10 identify graph of rational functions B
A11 summing arithmetic progression A
A12 find point with given slope A
A13 equation of tangent A
A14 find minimum gradient of cubic B
A15 find and classify stationary points of cubic A
A16 trig chain rule A
A17 chain rule A
A18 definite integral A
A19 area between curve and x-axis (in 2 parts) B
A20 product rule with given values B

Load the student scores for the test:

Show data summary

test_scores %>% skim()
Data summary
Name Piped data
Number of rows 3493
Number of columns 26
_______________________
Column type frequency:
character 5
numeric 21
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Start 2668 0.24 22 24 0 824 0
End 2668 0.24 22 24 0 825 0
Duration 2668 0.24 8 23 0 161 0
AnonID 0 1.00 5 8 0 3471 0
year 0 1.00 5 5 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
A1 0 1 4.25 1.51 0 5.00 5.0 5.00 5 ▁▁▂▁▇
A2 0 1 4.23 1.73 0 5.00 5.0 5.00 5 ▁▁▁▁▇
A3 0 1 3.20 2.40 0 0.00 5.0 5.00 5 ▅▁▁▁▇
A4 0 1 3.67 1.56 0 3.00 4.0 5.00 5 ▂▂▂▃▇
A5 0 1 3.97 2.02 0 5.00 5.0 5.00 5 ▂▁▁▁▇
A6 0 1 1.57 1.89 0 0.00 0.0 2.00 5 ▇▅▁▁▃
A7 0 1 3.45 2.24 0 0.00 5.0 5.00 5 ▃▁▁▁▇
A8 0 1 3.03 2.44 0 0.00 5.0 5.00 5 ▅▁▁▁▇
A9 0 1 4.16 1.87 0 5.00 5.0 5.00 5 ▂▁▁▁▇
A10 0 1 3.14 2.08 0 2.50 5.0 5.00 5 ▃▁▃▁▇
A11 0 1 3.89 1.67 0 2.50 5.0 5.00 5 ▁▁▂▁▇
A12 0 1 3.78 2.06 0 4.00 5.0 5.00 5 ▃▁▁▁▇
A13 0 1 3.53 2.05 0 2.00 5.0 5.00 5 ▂▂▁▁▇
A14 0 1 2.50 1.99 0 0.00 2.0 5.00 5 ▇▇▁▁▇
A15 0 1 3.35 2.13 0 1.25 5.0 5.00 5 ▃▁▂▁▇
A16 0 1 4.23 1.81 0 5.00 5.0 5.00 5 ▂▁▁▁▇
A17 0 1 4.36 1.67 0 5.00 5.0 5.00 5 ▁▁▁▁▇
A18 0 1 3.33 2.36 0 0.00 5.0 5.00 5 ▅▁▁▁▇
A19 0 1 2.33 2.49 0 0.00 0.0 5.00 5 ▇▁▁▁▇
A20 0 1 1.86 2.42 0 0.00 0.0 5.00 5 ▇▁▁▁▅
Total 0 1 67.83 23.04 0 57.00 72.5 84.75 100 ▁▁▃▇▇

Data cleaning

  1. For students who took the test more than once, consider the attempt with the highest scores only and remove the others;

  2. Eliminate the students who scored three or more zeros in the 5 easiest questions in the second-half of the test; and

  3. Add the students scoring at least 30 marks in total back to the sample.

# Apply the three data-cleaning rules:
#   1. Keep each student's single best attempt. with_ties = FALSE guarantees
#      exactly one row per AnonID even when two attempts tie on the maximum
#      Total (the default with_ties = TRUE would keep both, violating rule 1).
#   2/3. Drop students with 3+ zeros on the five easiest second-half questions,
#      unless their overall Total is at least 30.
test_scores <- test_scores_unfiltered %>% 
  group_by(AnonID) %>% 
  slice_max(Total, n = 1, with_ties = FALSE) %>% 
  ungroup() %>% 
  # logical comparisons sum as 0/1, so this is fully vectorised —
  # no need for rowwise(), which is slow on ~3.5k rows
  mutate(zeros_in_easiest_5 = (A11 == 0) + (A12 == 0) + (A13 == 0) + (A16 == 0) + (A17 == 0)) %>% 
  filter(zeros_in_easiest_5 <= 2 | Total >= 30) %>% 
  select(-zeros_in_easiest_5)

# test_scores <- test_scores_unfiltered %>% 
#   group_by(AnonID) %>% 
#   slice_max(Total, n = 1) %>% 
#   ungroup() %>% 
#   filter(Total > 0)

# Compare the Total-score distributions before and after filtering,
# side by side as faceted histograms.
histogram_data <- bind_rows(
  "unfiltered" = test_scores_unfiltered %>% select(Total),
  "filtered" = test_scores %>% select(Total),
  .id = "dataset"
) %>% 
  # show the unfiltered panel first
  mutate(dataset = fct_relevel(dataset, "unfiltered", "filtered"))

ggplot(histogram_data, aes(x = Total)) +
  geom_histogram() +
  facet_wrap(vars(dataset)) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data summary

The number of responses from each class:

# Count the rows remaining after cleaning and show as a one-cell table.
# summarise(n = n()) is exactly what tally() expands to on ungrouped data.
test_scores %>% 
  summarise(n = n()) %>% 
  gt() %>% 
  data_color(
    columns = "n",
    colors = scales::col_numeric(palette = "Blues", domain = NULL)
  )
n
3218

Mean and standard deviation for each item:

# Per-item mean and SD for each year, laid out with one column group per year.
# skim_without_charts() produces one row per (year, item); the pivot then
# spreads each year's statistics into its own "{year}__{stat}" columns.
test_scores %>% 
  select(-Start, -End, -Duration, -AnonID, -Total) %>% 
  group_by(year) %>% 
  skim_without_charts() %>% 
  # drop the character-column stats, the percentile columns, and the type flag
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year (splits on the "__" inserted by names_glue)
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  # (one relabel per year, built as a named list from the distinct years)
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
13-16
complete n_missing Mean SD
A1 100% 0 4.42 1.30
A2 100% 0 4.43 1.51
A3 100% 0 3.38 2.34
A4 100% 0 3.89 1.35
A5 100% 0 4.21 1.83
A6 100% 0 1.70 1.91
A7 100% 0 3.70 2.11
A8 100% 0 3.24 2.39
A9 100% 0 4.46 1.55
A10 100% 0 3.35 1.98
A11 100% 0 4.15 1.41
A12 100% 0 4.07 1.84
A13 100% 0 3.81 1.87
A14 100% 0 2.70 1.94
A15 100% 0 3.60 2.00
A16 100% 0 4.55 1.44
A17 100% 0 4.68 1.22
A18 100% 0 3.59 2.25
A19 100% 0 2.51 2.50
A20 100% 0 2.01 2.45

2. Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Inter-item correlations

If the test is unidimensional then we would expect student scores on pairs of items to be correlated.

This plot shows the correlations between scores on each pair of items:

# Keep only the per-item score columns A1-A20. AnonID also starts with "A",
# so it must be dropped explicitly.
item_scores <- select(test_scores, starts_with("A"), -AnonID)

# Pairwise correlations between all items, with confidence intervals
cor_ci <- psych::corCi(item_scores, plot = FALSE)

# Heatmap of the correlations with their upper/lower CI bounds
psych::cor.plot.upperLowerCi(cor_ci)

Checking for correlations that are not significantly different from 0, there are none:

# List item pairs whose correlation is not significantly different from 0,
# most doubtful (largest p) first.
cor_ci$ci %>% 
  as_tibble(rownames = "corr") %>% 
  # drop the ".e" (standard error) columns before displaying
  select(-contains(".e")) %>% 
  filter(p > 0.05) %>% 
  arrange(desc(p)) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A2-A17 −0.001 0.079 0.055
A3-A8 −0.001 0.071 0.054

The overall picture is that the item scores are well correlated with each other.

Dimensionality

# check_factorstructure() runs the KMO and Bartlett sphericity checks;
# n_factors() compares many methods for choosing the number of factors.
# NOTE(review): `structure` and `n` shadow base R functions; renaming would be
# cleaner, but `n` is referenced by later chunks (plot(n), summary(n)).
structure <- check_factorstructure(item_scores)
n <- n_factors(item_scores)

Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.89).
    • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(190) = 7850.38, p < .001).

Method Agreement Procedure:

The choice of 1 dimensions is supported by 8 (34.78%) methods out of 23 (t, p, Acceleration factor, R2, VSS complexity 1, Velicer’s MAP, TLI, RMSEA).

plot(n)

summary(n) %>% gt()
n_Factors n_Methods
1 8
2 2
3 3
4 5
12 1
13 1
19 3
#n %>% tibble() %>% gt()
fa.parallel(item_scores, fa = "fa")

## Parallel analysis suggests that the number of factors =  5  and the number of components =  NA

1 Factor

We use the factanal function to fit a 1-factor model.

Note that this function cannot handle missing data, so any NA scores must be set to 0 for this analysis.

# Fit a 1-factor model by maximum likelihood; the printout hides loadings
# with absolute value below 0.3 and sorts items by loading.
fitfact <- factanal(item_scores,
                    factors = 1,
                    rotation = "varimax")
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 1, rotation = "varimax")
## 
## Uniquenesses:
##   A1   A2   A3   A4   A5   A6   A7   A8   A9  A10  A11  A12  A13  A14  A15  A16 
## 0.90 0.96 0.83 0.73 0.89 0.84 0.80 0.95 0.89 0.83 0.92 0.84 0.81 0.63 0.89 0.85 
##  A17  A18  A19  A20 
## 0.90 0.85 0.74 0.68 
## 
## Loadings:
##  [1] 0.52 0.61 0.51 0.56 0.31      0.41 0.33 0.40 0.45      0.34 0.41      0.40
## [16] 0.44 0.33 0.39 0.32 0.39
## 
##                Factor1
## SS loadings       3.29
## Proportion Var    0.16
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 1275.43 on 170 degrees of freedom.
## The p-value is 1.49e-168
# Tidy the 1-factor loadings and plot each item's loading on a single axis,
# coloured by MATH group.
load <- tidy(fitfact)

load %>% 
  select(question = variable, factor_loading = fl1) %>% 
  left_join(item_info, by = "question") %>% 
  ggplot(aes(x = factor_loading, y = 0, colour = MATH_group)) + 
    geom_point() + 
    # max.overlaps = Inf forces every point to be labelled; the default (10)
    # silently dropped 2 labels (see the ggrepel warning this chunk produced)
    geom_label_repel(aes(label = paste0("A", rownames(load))), show.legend = FALSE,
                     max.overlaps = Inf) +
    scale_colour_manual(values = MATH_colours) +
  scale_y_discrete() +
    labs(x = "Factor 1", y = NULL,
         title = "Standardised Loadings", 
         subtitle = "Based upon correlation matrix") +
    theme_minimal()
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Table of every item's 1-factor loading, strongest first, with cells
# coloured by loading strength and by MATH group.
load %>% 
  select(question = variable, factor_loading = fl1) %>% 
  left_join(item_info, by = "question") %>% 
  arrange(-factor_loading) %>% 
  gt() %>%
  data_color(
    columns = contains("factor"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("MATH"),
    # MATH_colours is passed directly as the (categorical) palette
    colors = MATH_colours
  )
question factor_loading description MATH_group
A14 0.6106549 find minimum gradient of cubic B
A20 0.5646130 product rule with given values B
A4 0.5174262 completing the square A
A19 0.5088218 area between curve and x-axis (in 2 parts) B
A7 0.4476711 graphical vector sum B
A13 0.4360296 equation of tangent A
A3 0.4140260 composition of functions B
A10 0.4122722 identify graph of rational functions B
A6 0.3989275 trig wave function A
A12 0.3978407 find point with given slope A
A16 0.3874777 trig chain rule A
A18 0.3866565 definite integral A
A9 0.3370271 simplify logs A
A15 0.3341290 find and classify stationary points of cubic A
A5 0.3337256 trig double angle formula A
A17 0.3236824 chain rule A
A1 0.3088217 properties of fractions A
A11 0.2904018 summing arithmetic progression A
A8 0.2339609 compute angle between 3d vectors A
A2 0.2079086 find intersection of lines A

It is striking here that the MATH Group B questions are those that load most strongly onto this factor.

4 Factor

Here we also investigate the 4-factor solution, to see whether these factors are interpretable.

# Fit the 4-factor model for comparison, with the same rotation and printout
# conventions as the 1-factor fit above.
fitfact2 <- factanal(item_scores, factors = 4, rotation = "varimax")
print(fitfact2, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 4, rotation = "varimax")
## 
## Uniquenesses:
##   A1   A2   A3   A4   A5   A6   A7   A8   A9  A10  A11  A12  A13  A14  A15  A16 
## 0.88 0.95 0.70 0.69 0.86 0.76 0.77 0.81 0.87 0.80 0.90 0.81 0.69 0.58 0.87 0.59 
##  A17  A18  A19  A20 
## 0.45 0.83 0.75 0.66 
## 
## Loadings:
##     Factor1 Factor2 Factor3 Factor4
## A3  0.54                           
## A16         0.60                   
## A17         0.73                   
## A13                 0.50           
## A1  0.33                           
## A2                                 
## A4  0.48                           
## A5                                 
## A6                          0.41   
## A7  0.37                           
## A8                          0.42   
## A9                                 
## A10 0.40                           
## A11                                
## A12                 0.32           
## A14 0.41            0.46           
## A15                                
## A18                                
## A19 0.37                           
## A20 0.48                           
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings       1.82    1.14    1.01    0.80
## Proportion Var    0.09    0.06    0.05    0.04
## Cumulative Var    0.09    0.15    0.20    0.24
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 196.39 on 116 degrees of freedom.
## The p-value is 4.57e-06
# Tidy the 4-factor loadings (columns fl1..fl4)
load2 <- tidy(fitfact2)

# Scatter plot of each item's loadings on the first two factors,
# labelled by item and coloured/shaped by MATH group
load2_plot <- load2 %>%
  left_join(item_info, by = c("variable"= "question")) %>%
  ggplot(aes(x = fl1, y = fl2, colour = MATH_group, shape = MATH_group)) +
  geom_point() +
  geom_text_repel(aes(label = paste0("A", rownames(load2))), show.legend = FALSE, alpha = 0.6) +
  labs(
    x = "Factor 1 (of 4)",
    y = "Factor 2 (of 4)"
  ) +
  scale_colour_manual("MATH group", values = MATH_colours) +
  scale_shape_manual(name = "MATH group", values = c(19, 17)) +
  theme_minimal()

# Display a titled version in the report...
load2_plot +
  labs(
    title = "Standardised Loadings",
    subtitle = "Showing the first 2 factors of the 4-factor model"
  )

# ...but save the untitled version for external use
ggsave("output/uoe_4factor.pdf", units = "cm", width = 12, height = 8, dpi = 300,
       plot = load2_plot)
# For each item, identify the factor it loads onto most strongly (by absolute
# loading). slice_max() replaces the superseded top_n(), consistent with its
# use earlier in this file; ties are kept, matching top_n()'s behaviour.
main_factors <- load2 %>% 
#  mutate(factorNone = 0.4) %>%  # add this to set the main factor to "None" where all loadings are below 0.4
  pivot_longer(names_to = "factor",
               cols = contains("fl")) %>% 
  mutate(value_abs = abs(value)) %>% 
  group_by(variable) %>% 
  slice_max(value_abs, n = 1) %>% 
  ungroup() %>% 
  transmute(main_factor = factor, variable)


# Table of all 4-factor loadings, grouped by each item's main factor,
# with loadings above 0.4 (in absolute value) shown in bold.
load2 %>% 
  select(-uniqueness) %>% 
  # add the info about which is the main factor
  left_join(main_factors, by = "variable") %>%
  left_join(item_info %>% select(variable = question, description, MATH_group), by = "variable") %>% 
  arrange(main_factor) %>% 
  select(main_factor, everything()) %>% 
  # arrange items by descending maximum absolute loading within each main factor
  rowwise() %>% 
  mutate(max_loading = max(abs(c_across(starts_with("fl"))))) %>% 
  group_by(main_factor) %>% 
  arrange(-max_loading, .by_group = TRUE) %>% 
  select(-max_loading) %>% 
  # sort out the presentation
  rename("Main Factor" = main_factor,
         "Question" = variable) %>%
  # across() replaces the superseded mutate_at(); the original
  # if_else(abs(.) > 0.4, T, F) was redundant — the comparison is already logical
  mutate(across(
    starts_with("fl"),
    ~ cell_spec(round(.x, digits = 3), bold = abs(.x) > 0.4)
  )) %>% 
  kable(booktabs = TRUE, escape = FALSE, longtable = TRUE) %>% 
  kableExtra::collapse_rows(columns = 1, valign = "top") %>%
  kableExtra::kable_styling(latex_options = c("repeat_header"))
Main Factor Question fl1 fl2 fl3 fl4 description MATH_group
fl1 A3 0.538 0.035 0.057 0.031 composition of functions B
A20 0.483 0.144 0.257 0.129 product rule with given values B
A4 0.482 0.049 0.185 0.197 completing the square A
A10 0.398 0.086 0.159 0.066 identify graph of rational functions B
A7 0.373 0.013 0.248 0.161 graphical vector sum B
A19 0.37 0.177 0.244 0.163 area between curve and x-axis (in 2 parts) B
A1 0.33 0.025 0.084 0.079 properties of fractions A
A9 0.247 0.15 0.072 0.202 simplify logs A
A2 0.166 0.03 0.074 0.114 find intersection of lines A
fl2 A17 0.028 0.727 0.126 0.056 chain rule A
A16 0.108 0.598 0.135 0.16 trig chain rule A
A18 0.19 0.244 0.162 0.223 definite integral A
fl3 A13 0.171 0.127 0.504 0.083 equation of tangent A
A14 0.406 0.122 0.46 0.161 find minimum gradient of cubic B
A12 0.173 0.103 0.323 0.208 find point with given slope A
A15 0.126 0.178 0.261 0.134 find and classify stationary points of cubic A
fl4 A8 0.022 0.062 0.083 0.418 compute angle between 3d vectors A
A6 0.194 0.074 0.17 0.415 trig wave function A
A5 0.24 0.085 0.08 0.254 trig double angle formula A
A11 0.21 0.113 0.044 0.212 summing arithmetic progression A

The first factor is dominated by questions that had previously been identified as MATH Group B, i.e. those that are somehow “non-standard” – either requiring students to recognise that a particular rule/procedure is applicable before applying it, or to apply it in an unusual way. This factor also includes Group A questions on “pre-calculus” topics (such as fractions, logarithms and trigonometry) that students had perhaps not explicitly practiced most recently.

The second factor is dominated by the two chain rule questions (A16 and A17), along with A18 which is a routine definite integral, suggesting this factor is related to routine calculus computations.

The third factor seems to be based on applying calculus techniques to cubic and quadratic curves, e.g. to find tangent lines or stationary points.

The fourth factor is dominated by the only two questions that require the use of a calculator (to compute trigonometric functions), but more generally seems to be based on non-calculus skills (vectors, trig, sequences).

3. Fitting IRT model

The mirt implementation of the generalised partial credit model (gpcm) requires that the partial marks are consecutive integers. We therefore need to work around this by adjusting our scores into that form (e.g. replacing scores of 0, 2.5, 5 with 1, 2, 3), while keeping track of the true scores attached to each mark level so that we can properly compute expected scores later on.

# Determine the mark levels for each item
mark_levels <- item_scores %>% 
  pivot_longer(everything(), names_to = "item", values_to = "score") %>% 
  distinct() %>% 
  arrange(parse_number(item), score) %>% 
  group_by(item) %>%
  mutate(order = row_number()) %>% 
# Note that the convention used by mirt is for items that have only 2 levels (i.e. 0 marks or full marks),
# the columns are P.0 and P.1, while other items are indexed from 1, i.e. P.1, P.2, ...
# https://github.com/philchalmers/mirt/blob/accd2383b9a4d17a4cab269717ce98434900b62c/R/probtrace.R#L57
  mutate(level = case_when(
    max(order) == 2 ~ order - 1,
    TRUE ~ order * 1.0
  )) %>% 
  mutate(levelname = paste0(item, ".P.", level))

# Use the mark_levels table to replace scores with levels
# (first pivot the data to long form, make the replacement, then pivot back to wide again)
item_scores_levelled <- item_scores %>% 
  # temporarily add row identifiers
  mutate(row = row_number()) %>% 
  pivot_longer(cols = -row, names_to = "item", values_to = "score") %>% 
  left_join(mark_levels %>% select(item, score, level), by = c("item", "score")) %>% 
  select(-score) %>% 
  pivot_wider(names_from = "item", values_from = "level") %>% 
  select(-row)

Show model fitting output

# Fit a unidimensional generalised partial credit model to the recoded scores
fit_gpcm <- mirt(
  data = item_scores_levelled, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "gpcm",  # generalised partial credit model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -50878.045, Max-Change: 6.48115
Iteration: 2, Log-Lik: -47114.510, Max-Change: 1.37552
Iteration: 3, Log-Lik: -46095.960, Max-Change: 3.69663
Iteration: 4, Log-Lik: -45826.548, Max-Change: 0.91197
Iteration: 5, Log-Lik: -45676.120, Max-Change: 0.84981
Iteration: 6, Log-Lik: -45613.155, Max-Change: 0.30728
Iteration: 7, Log-Lik: -45588.844, Max-Change: 0.22273
Iteration: 8, Log-Lik: -45577.502, Max-Change: 0.09224
Iteration: 9, Log-Lik: -45571.850, Max-Change: 0.10991
Iteration: 10, Log-Lik: -45568.215, Max-Change: 0.04183
Iteration: 11, Log-Lik: -45566.238, Max-Change: 0.02481
Iteration: 12, Log-Lik: -45565.015, Max-Change: 0.01857
Iteration: 13, Log-Lik: -45564.348, Max-Change: 0.00998
Iteration: 14, Log-Lik: -45563.718, Max-Change: 0.00663
Iteration: 15, Log-Lik: -45563.365, Max-Change: 0.00618
Iteration: 16, Log-Lik: -45562.812, Max-Change: 0.00141
Iteration: 17, Log-Lik: -45562.801, Max-Change: 0.00175
Iteration: 18, Log-Lik: -45562.785, Max-Change: 0.00018
Iteration: 19, Log-Lik: -45562.785, Max-Change: 0.00017
Iteration: 20, Log-Lik: -45562.784, Max-Change: 0.00313
Iteration: 21, Log-Lik: -45562.771, Max-Change: 0.00065
Iteration: 22, Log-Lik: -45562.770, Max-Change: 0.00108
Iteration: 23, Log-Lik: -45562.765, Max-Change: 0.00187
Iteration: 24, Log-Lik: -45562.758, Max-Change: 0.00047
Iteration: 25, Log-Lik: -45562.758, Max-Change: 0.00086
Iteration: 26, Log-Lik: -45562.756, Max-Change: 0.00013
Iteration: 27, Log-Lik: -45562.755, Max-Change: 0.00036
Iteration: 28, Log-Lik: -45562.755, Max-Change: 0.00011
Iteration: 29, Log-Lik: -45562.754, Max-Change: 0.00028
Iteration: 30, Log-Lik: -45562.754, Max-Change: 0.00007
## 
## Calculating information matrix...

Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

# Heatmap of the pairwise Q3 statistics (upper triangle only).
# NOTE(review): `residuals` is assumed to hold Yen's Q3 item-pair statistics,
# presumably computed in a chunk not shown here — confirm its source.
residuals  %>% as.matrix() %>% 
  corrplot::corrplot(type = "upper")

This shows that most item pairs are independent, with only one pair showing cause for concern:

# List the item pairs whose Q3 statistic exceeds the 0.2 threshold.
# The parse_number comparison keeps only the upper triangle, so each
# pair appears once.
residuals %>%
  rownames_to_column(var = "item1") %>%
  as_tibble() %>% 
  pivot_longer(cols = starts_with("A"), names_to = "item2", values_to = "Q3_score") %>% 
  filter(
    abs(Q3_score) > 0.2,
    parse_number(item1) < parse_number(item2)
  ) %>%
  gt()
item1 item2 Q3_score
A16 A17 0.3234112

Items A16 and A17 are on the chain rule (e.g. differentiating \(\cos(4x^2+5)\) and \((3x^2-8)^3\) respectively), so it is perhaps unsurprising that students’ performance on these items is not entirely independent.

Given that this violation of the local independence assumption is very mild, we proceed using this model.

Model parameters

We augment the data with estimated abilities for each student, using mirt’s fscores() function.

# Append each student's estimated ability from the fitted GPCM (EAP estimator).
# NOTE(review): fscores() returns a one-column matrix, so F1 is stored as a
# matrix column — consider fscores(...)[, 1] if a plain vector is needed downstream.
test_scores_with_ability <- test_scores %>%
  mutate(F1 = fscores(fit_gpcm, method = "EAP"))

Next, we extract the IRT parameters.

coefs_gpcm <- coef(fit_gpcm, IRTpars = TRUE)

We use a modified version of the tidy_mirt_coeffs function to get all the parameter estimates into a tidy table:

# Tidy one element of a mirt coef() list into a one-row-per-parameter tibble.
# Keeps only the GPCM parameters (discrimination "a" and difficulties
# "b1", "b2", ...) and spreads the estimate / CI rows into columns.
tidy_mirt_coefs <- function(x) {
  # Flatten the coefficient matrix to long form, with X1 = row label
  # (par / CI bound) and X2 = parameter name
  flat <- x %>%
    melt() %>%
    as_tibble() %>%
    mutate(across(where(is.factor), as.character))

  flat %>%
    filter(X2 == "a" | startsWith(X2, "b")) %>%
    # relabel the point-estimate rows from "par" to "est"
    mutate(X1 = if_else(X1 == "par", "est", X1)) %>%
    pivot_wider(names_from = X1, values_from = value) %>%
    rename(par = X2)
}

# use head(., -1) to remove the last element, `GroupPars`, which does not correspond to a question
tidy_gpcm <- map_dfr(head(coefs_gpcm, -1), tidy_mirt_coefs, .id = "Question")

# Table with each question's discrimination (repeated) alongside each of its
# difficulty/step parameters, grouped by question
tidy_gpcm %>% 
  filter(par == "a") %>% 
  select(-par) %>% 
  rename_with(.fn = ~ paste0("a_", .x), .cols = -Question) %>% 
  left_join(
    tidy_gpcm %>% 
      filter(str_detect(par, "^b")),
    by = "Question"
  ) %>% 
  gt(groupname_col = "Question") %>%
  # BUG FIX: contains("est|_") matches the literal string "est|_", i.e. no
  # columns at all, which is why the rendered table showed unrounded values;
  # matches() interprets "est|_" as the intended regular expression
  fmt_number(columns = matches("est|_"), decimals = 3) %>%
  data_color(
    columns = contains("a_"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = c("est", "CI_2.5", "CI_97.5"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = c("par", "est", "CI_2.5", "CI_97.5"))
Discrimination Difficulty
a_est a_CI_2.5 a_CI_97.5 par est CI_2.5 CI_97.5
A1
0.6723084 0.5751015 0.7695153 b1 -2.38121101 -2.70997568 -2.0524463
0.6723084 0.5751015 0.7695153 b2 -2.77548483 -3.16478370 -2.3861860
A2
0.3653255 0.2924821 0.4381688 b1 2.05910709 1.24903148 2.8691827
0.3653255 0.2924821 0.4381688 b2 -8.64096737 -10.39837294 -6.8835618
A3
1.0846154 0.9651998 1.2040311 b1 -0.84507544 -0.94875959 -0.7413913
A4
0.2732681 0.2458136 0.3007226 b1 11.77129071 7.31779014 16.2247913
0.2732681 0.2458136 0.3007226 b2 -14.47047547 -18.90538068 -10.0355703
0.2732681 0.2458136 0.3007226 b3 3.21288929 1.76645622 4.6593224
0.2732681 0.2458136 0.3007226 b4 -7.39018750 -8.82461529 -5.9557597
0.2732681 0.2458136 0.3007226 b5 2.20674861 1.28297349 3.1305237
0.2732681 0.2458136 0.3007226 b6 -4.92479531 -5.84548305 -4.0041076
0.2732681 0.2458136 0.3007226 b7 1.56384346 0.89425267 2.2334342
0.2732681 0.2458136 0.3007226 b8 -3.60351533 -4.28831056 -2.9187201
0.2732681 0.2458136 0.3007226 b9 6.28006076 5.27958818 7.2805333
0.2732681 0.2458136 0.3007226 b10 -9.54712121 -10.80063019 -8.2936122
A5
1.0428722 0.9018365 1.1839079 b1 -1.91040229 -2.12197484 -1.6988297
A6
0.4239886 0.3758632 0.4721140 b1 0.75688597 0.53151886 0.9822531
0.4239886 0.3758632 0.4721140 b2 9.25602035 7.83195465 10.6800860
0.4239886 0.3758632 0.4721140 b3 -7.51330248 -8.90069421 -6.1259107
A7
0.6605959 0.5890772 0.7321146 b1 1.72144960 1.34525750 2.0976417
0.6605959 0.5890772 0.7321146 b2 -3.94060901 -4.42627718 -3.4549408
A8
0.5205972 0.4300310 0.6111634 b1 -1.24954110 -1.49216124 -1.0069210
A9
1.2705396 1.0926784 1.4484008 b1 -2.09133758 -2.30748029 -1.8751949
A10
0.6358122 0.5659312 0.7056931 b1 -0.83168927 -1.00206107 -0.6613175
0.6358122 0.5659312 0.7056931 b2 -1.09426103 -1.28539986 -0.9031222
A11
0.2864361 0.2446393 0.3282328 b1 -1.83145049 -2.73456002 -0.9283410
0.2864361 0.2446393 0.3282328 b2 -5.95004777 -6.93468767 -4.9654079
0.2864361 0.2446393 0.3282328 b3 9.62738676 7.78716429 11.4676092
0.2864361 0.2446393 0.3282328 b4 -14.06588786 -16.40049294 -11.7312828
A12
0.4460071 0.3929810 0.4990331 b1 0.92822050 0.47481364 1.3816274
0.4460071 0.3929810 0.4990331 b2 0.83113166 0.29046535 1.3717980
0.4460071 0.3929810 0.4990331 b3 -6.82961157 -7.74034997 -5.9188732
A13
0.4488647 0.3992331 0.4984964 b1 -0.92253172 -1.21262393 -0.6324395
0.4488647 0.3992331 0.4984964 b2 4.46010212 3.64908715 5.2711171
0.4488647 0.3992331 0.4984964 b3 -7.96232050 -9.02288332 -6.9017577
A14
0.5046730 0.4581541 0.5511918 b1 1.60380788 1.20855501 1.9990608
0.5046730 0.4581541 0.5511918 b2 -3.79461618 -4.22363829 -3.3655941
0.5046730 0.4581541 0.5511918 b3 8.76562793 7.40183062 10.1294252
0.5046730 0.4581541 0.5511918 b4 -3.78536183 -4.99989631 -2.5708273
0.5046730 0.4581541 0.5511918 b5 -4.34538545 -4.96947152 -3.7212994
A15
0.2308369 0.2003731 0.2613006 b1 8.36798985 6.79251330 9.9434664
0.2308369 0.2003731 0.2613006 b2 -6.60105721 -7.92255009 -5.2795643
0.2308369 0.2003731 0.2613006 b3 3.12297857 2.21083233 4.0351248
0.2308369 0.2003731 0.2613006 b4 -10.65871533 -12.24894228 -9.0684884
A16
1.7603387 1.5175207 2.0031568 b1 -1.87767406 -2.03647787 -1.7188702
A17
1.6518108 1.3915811 1.9120405 b1 -2.22573932 -2.44611061 -2.0053680
A18
0.9974632 0.8799068 1.1150197 b1 -1.12967653 -1.26055954 -0.9987935
A19
1.3729577 1.2395683 1.5063472 b1 -0.01895512 -0.08738595 0.0494757
A20
1.8097972 1.6347040 1.9848905 b1 0.33470755 0.27212472 0.3972904
tidy_gpcm %>% 
  write_csv("output/uoe_pre_gpcm-results.csv")

Information curves

# Grid of ability values over which to evaluate the information functions
theta <- seq(-6, 6, by=0.05)

# With individual = TRUE, testinfo() returns a matrix of per-item information:
# one row per theta value, one column per item
info_matrix <- testinfo(fit_gpcm, theta, individual = TRUE)
# NOTE(review): assumes item_info rows are in the same order as the model's
# items (A1..A20) — confirm against the column order of item_scores_levelled
colnames(info_matrix) <- item_info %>% pull(question)
item_info_data <- info_matrix %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "item", values_to = "info_y") %>% 
  left_join(item_info %>% select(item = question, MATH_group), by = "item") %>% 
  # order the item factor numerically (A1, A2, ..., A20) for plotting
  mutate(item = fct_reorder(item, parse_number(item)))

Test information curve

# Total test information curve: sum the per-item information at each theta
item_info_data %>% 
  group_by(theta) %>% 
  summarise(info_y = sum(info_y)) %>% 
  ggplot(aes(x = theta, y = info_y)) +
  geom_line() +
  labs(y = "Information") +
  theme_minimal()

# ggsave() with no plot argument saves the last plot displayed (the one above)
ggsave("output/uoe_pre_info.pdf", width = 6, height = 4, units = "in")

This shows that the information given by the test is skewed toward the lower end of the ability scale - i.e. it can give more accurate estimates of students’ ability where their ability level is slightly below the mean.

Item information curves

Breaking this down by question helps to highlight those questions that are most/least informative:

# One information curve per item, small-multiples style
item_info_data %>% 
  ggplot(aes(x = theta, y = info_y, colour = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Information") +
  theme_minimal()

We can also compute the sums of different subsets of the information curves – here, we will look at the questions based on their MATH group:

# Information curves summed over all items, and over each MATH group separately
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    items_all = sum(info_y),
    # ifelse() zeroes out the other group's items so each sum spans the full grid
    items_A = sum(ifelse(MATH_group == "A", info_y, 0)),
    items_B = sum(ifelse(MATH_group == "B", info_y, 0))
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  mutate(items = fct_relevel(items, "all", "A", "B")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Information") +
  theme_minimal()

ggsave("output/uoe_pre_info-curves_A-vs-B.pdf", units = "cm", width = 14, height = 6)

This shows that the information in the MATH Group B questions is at a higher point on the ability scale than for the MATH Group A questions.

Since the number of items in each case is different, we consider instead the mean information per item:

# As above, but dividing each group's summed information by its item count,
# so groups of different sizes are comparable
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    items_all = sum(info_y) / n(),
    items_A = sum(ifelse(MATH_group == "A", info_y, 0)) / sum(ifelse(MATH_group == "A", 1, 0)),
    items_B = sum(ifelse(MATH_group == "B", info_y, 0)) / sum(ifelse(MATH_group == "B", 1, 0))
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  mutate(items = fct_relevel(items, "all", "A", "B")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Mean information per item") +
  theme_minimal()

ggsave("output/uoe_pre_info-curves_A-vs-B-avg.pdf", units = "cm", width = 10, height = 6)

This shows that items of each MATH group are giving broadly similar levels of information on average, but at different points on the ability scale.

Total information

Using mirt’s areainfo function, we can find the total area under the information curves.

# Area under the test information curve over -4 <= theta <= 4,
# and the proportion of the total information that range captures
info_gpcm <- areainfo(fit_gpcm, c(-4,4))
info_gpcm %>% gt()
LowerBound UpperBound Info TotalInfo Proportion nitems
-4 4 24.83953 27.35916 0.9079054 20

This shows that the total information in all items is 27.3591601.

Information by item

# Information contributed by each item, computed one item at a time.
# NOTE(review): this pulls TotalInfo (information over the whole theta range),
# so the c(-4, 4) bounds have no effect on these values — the bounded table
# further below pulls Info instead; confirm this difference is intentional.
tidy_info <- item_info %>%
  mutate(item_num = row_number()) %>% 
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(fit_gpcm,
               c(-4, 4),
               which.items = .x) %>% pull(TotalInfo)
  ))

tidy_info %>%
  select(-item_num) %>% 
  arrange(-TotalInfo) %>% 
  #group_by(outcome) %>% 
  gt() %>% 
  # NOTE(review): no columns here match "a_", "b_" or "outcome", so the next
  # two fmt_number calls and the second data_color are no-ops — presumably
  # carried over from a related table template
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
question description MATH_group Information
A4 completing the square A 2.7245913
A14 find minimum gradient of cubic B 2.5214357
A20 product rule with given values B 1.8097972
A16 trig chain rule A 1.7603377
A17 chain rule A 1.6518064
A19 area between curve and x-axis (in 2 parts) B 1.3729548
A1 properties of fractions A 1.3404446
A13 equation of tangent A 1.3388898
A12 find point with given slope A 1.3343117
A7 graphical vector sum B 1.3208283
A9 simplify logs A 1.2704844
A10 identify graph of rational functions B 1.2691976
A6 trig wave function A 1.2669441
A11 summing arithmetic progression A 1.1046449
A3 composition of functions B 1.0845542
A5 trig double angle formula A 1.0426419
A18 definite integral A 0.9973049
A15 find and classify stationary points of cubic A 0.9137891
A2 find intersection of lines A 0.7205042
A8 compute angle between 3d vectors A 0.5136975

Restricting instead to the range \(-2\leq\theta\leq2\):

# Per-item information restricted to -2 <= theta <= 2. Note we pull Info
# (the area within the integration bounds) rather than TotalInfo here,
# since the point is to restrict to the central ability range.
tidy_info <- item_info %>%
  mutate(
    item_num = row_number(),
    TotalInfo = purrr::map_dbl(item_num, function(i) {
      areainfo(fit_gpcm, c(-2, 2), which.items = i) %>% pull(Info)
    })
  )

# Item-information table (restricted range), most informative items first.
tidy_info %>%
  select(-item_num) %>%
  # desc() is the idiomatic descending sort; -x fails on non-numeric columns
  arrange(desc(TotalInfo)) %>%
  gt() %>%
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  # Shade information values: darker green = more information
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  # Colour-code the outcome grouping column
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
question description MATH_group Information
A14 find minimum gradient of cubic B 2.0623030
A20 product rule with given values B 1.6990122
A4 completing the square A 1.5766109
A19 area between curve and x-axis (in 2 parts) B 1.2072840
A16 trig chain rule A 0.9726624
A7 graphical vector sum B 0.9548041
A13 equation of tangent A 0.8474773
A10 identify graph of rational functions B 0.8012734
A3 composition of functions B 0.7961748
A12 find point with given slope A 0.7761167
A6 trig wave function A 0.7645452
A17 chain rule A 0.6721487
A18 definite integral A 0.6604543
A9 simplify logs A 0.5914672
A5 trig double angle formula A 0.5284072
A15 find and classify stationary points of cubic A 0.4778491
A1 properties of fractions A 0.4665714
A11 summing arithmetic progression A 0.3821670
A8 compute angle between 3d vectors A 0.2295292
A2 find intersection of lines A 0.1924824

Response curves

Since the GPCM is more complicated than the 2PL, there is a characteristic curve for each possible score on each question:

# Lookup from mirt's internal level names to item + numeric score
level_lookup <- mark_levels %>% select(item, level = levelname, score)

# One row per (theta, level): probability of each response level
trace_data <- probtrace(fit_gpcm, theta) %>%
  as_tibble() %>%
  bind_cols(theta = theta) %>%
  pivot_longer(cols = -theta, names_to = "level", values_to = "y") %>%
  left_join(level_lookup, by = "level") %>%
  mutate(score = as.factor(score))

# Category response curves, one panel per item, ordered by item number
trace_data %>%
  mutate(item = fct_reorder(item, parse_number(item))) %>%
  ggplot() +
  aes(x = theta, y = y, colour = score) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Probability of response") +
  theme_minimal()

To get a simplified picture for each question, we compute the expected score at each ability level:

# Expected score per item at each ability level: sum over score levels of
# (numeric score) * P(score | theta). The score factor must go via
# as.character() before as.double() to recover the original score values
# rather than the factor's internal level codes.
expected_scores <- trace_data %>% 
  mutate(item = fct_reorder(item, parse_number(item))) %>% 
  group_by(item, theta) %>% 
  summarise(expected_score = sum(as.double(as.character(score)) * y), .groups = "drop")

# Expected-score curve for each item, one panel per item
expected_scores %>%
  ggplot() +
  aes(x = theta, y = expected_score, colour = item) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Expected score") +
  theme_minimal()

The resulting curves look quite similar to those from the 2PL, allowing for a similar interpretation. For instance, superimposing all the curves shows that there is a spread of difficulties (i.e. the ability values \(\theta\) at which the expected score reaches 2.5 out of 5 vary across items) and that some questions are more discriminating than others (i.e. have steeper slopes):

# Superimpose all expected-score curves on a single panel; the `text`
# aesthetic feeds the plotly hover tooltip.
plt <- expected_scores %>% 
  ggplot(aes(x = theta, y = expected_score, colour = item, text = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Expected score") +
  theme_minimal()

# Interactive version: hovering a curve shows the item name
ggplotly(plt, tooltip = "text")
# Use the full argument name: `file =` only worked via partial matching
# against ggsave's `filename` parameter.
ggsave(plot = plt, filename = "output/uoe_pre_iccs-superimposed.pdf", width = 20, height = 14, units = "cm")

Test response curve

# Test response curve: total expected score across all items at each theta
total_expected_score <- expected_scores %>%
  group_by(theta) %>%
  summarise(expected_score = sum(expected_score), .groups = "drop")

# Test response curve (total expected score vs ability).
# The viridis colour scale in the original was a no-op: no colour
# aesthetic is mapped in this plot, so it has been removed.
total_expected_score %>% 
  ggplot(aes(x = theta, y = expected_score)) +
  geom_line() +
  # geom_point(data = total_expected_score %>% filter(theta == 0)) +
  # ggrepel::geom_label_repel(data = total_expected_score %>% filter(theta == 0), aes(label = round(expected_score, 1)), box.padding = 0.5) +
  labs(y = "Expected score") +
  theme_minimal()

Packages

In this analysis we used the following packages. You can learn more about each one by clicking on the links below.

  • mirt: For IRT analysis
  • psych: For factor analysis
  • tidyverse: For data wrangling and visualisation
  • reshape: For reshaping nested lists
  • vroom: For reading in many files at once
  • broom: For tidying model output
  • fs: For file system operations
  • gt: For formatting tables
  • knitr: For markdown tables
  • ggrepel: For labelling points without overlap
  • skimr: For data frame level summary
  • ggridges: For ridge plots